Chapter 3 Data statistics
3.1 Sequencing reads statistics
sample_metadata %>%
summarise(Total=sum(reads_post_fastp * 150 / 1000000000) %>% round(2),
mean=mean(reads_post_fastp * 150 / 1000000000) %>% round(2),
sd=sd(reads_post_fastp * 150 / 1000000000) %>% round(2)) %>%
unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
tt()| Total | Average |
|---|---|
| 171.92 | 4.91 ± 1.46 |
3.2 DNA fractions
sequence_fractions <- read_counts %>%
pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
group_by(sample) %>%
summarise(mags = sum(value)) %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
mutate(mags_bases = mags*146) %>%
mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)
sequence_fractions %>%
mutate_at(vars(-sample), ~./1000000000) %>%
rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
tt()| Sample | Low quality | Mapped to host | Unmapped | Mapped to MAGs |
|---|---|---|---|---|
| EHI01966 | 0.3568965 | 0.005030679 | 1.9178067 | 2.507921 |
| EHI01967 | 0.2692620 | 0.000709418 | 0.8339877 | 4.617569 |
| EHI01968 | 0.3467658 | 0.000167184 | 1.2259195 | 5.669136 |
| EHI01970 | 0.2568282 | 0.000235551 | 1.1356328 | 3.939774 |
| EHI01979 | 0.3068304 | 0.000429188 | 1.1643291 | 3.963429 |
| EHI01982 | 0.2321305 | 0.003106822 | 1.2765585 | 2.120459 |
| EHI01985 | 0.3350121 | 0.000294977 | 1.3705302 | 4.091743 |
| EHI01989 | 0.3249292 | 0.000570792 | 1.2444290 | 3.406020 |
| EHI01992 | 0.2112149 | 0.000650860 | 1.3810604 | 2.863645 |
| EHI01995 | 0.2441431 | 0.001014484 | 1.0559948 | 2.625781 |
| EHI02065 | 0.2881934 | 0.000248787 | 0.9495023 | 3.524029 |
| EHI02067 | 0.4887927 | 0.009035522 | 1.5062498 | 3.314287 |
| EHI02079 | 0.4087200 | 0.001541555 | 1.0763461 | 3.216692 |
| EHI02088 | 0.5380240 | 0.002508850 | 1.7069668 | 4.880649 |
| EHI02097 | 1.1086051 | 0.001111047 | 1.1464450 | 4.177690 |
| EHI02105 | 0.4367774 | 0.006671326 | 0.7868596 | 1.606939 |
| EHI02582 | 0.1565447 | 0.000041661 | 0.7099037 | 3.061117 |
| EHI02584 | 0.2131393 | 0.000243149 | 1.3719978 | 3.909083 |
| EHI02585 | 0.3055922 | 0.000693419 | 1.3837783 | 4.156742 |
| EHI02587 | 0.2002879 | 0.001053113 | 1.1718219 | 3.773711 |
| EHI02592 | 0.2204754 | 0.000807086 | 1.0309674 | 3.998601 |
| EHI02598 | 0.1922634 | 0.000085434 | 1.1517125 | 3.984019 |
| EHI02602 | 0.1798841 | 0.000148220 | 1.2512663 | 2.802077 |
| EHI02603 | 0.2165665 | 0.002208379 | 1.3526444 | 3.734602 |
| EHI02607 | 0.2170496 | 0.000239951 | 1.5274046 | 3.978722 |
| EHI02612 | 0.1376030 | 0.000170810 | 0.6921896 | 2.540514 |
| EHI02615 | 0.1893992 | 0.000351599 | 0.6650246 | 1.618650 |
| EHI02617 | 0.1957782 | 0.000243466 | 1.3981847 | 2.879747 |
| EHI02619 | 0.2292718 | 0.000086283 | 0.9636172 | 3.865591 |
| EHI02624 | 0.2077297 | 0.000351284 | 1.0437963 | 3.443889 |
| EHI02625 | 0.1169758 | 0.000023503 | 0.5396744 | 1.996437 |
| EHI02630 | 0.1994379 | 0.000258866 | 0.9279985 | 3.240507 |
| EHI02632 | 0.3725667 | 0.000748839 | 1.4198031 | 2.929934 |
| EHI02633 | 0.2780827 | 0.000903128 | 1.3810886 | 5.477885 |
| EHI02639 | 0.4685676 | 0.001126284 | 2.1791314 | 8.022893 |
sequence_fractions %>%
mutate_at(vars(-sample), ~./1000000000) %>%
rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
tt()| Low quality | Mapped to host | Unmapped | Mapped to MAGs |
|---|---|---|---|
| 0.2985812 | 0.001231758 | 1.198304 | 3.598299 |
sequence_fractions %>%
pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
mutate(value = value / 1000000000) %>%
mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
ggplot(., aes(x = sample, y = value, fill=fraction)) +
geom_bar(position="stack", stat = "identity") +
scale_fill_manual(name="Sequence type",
breaks=c("lowqual_bases","host_bases","unmapped_bases","mags_bases"),
labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
labs(x = "Samples", y = "Amount of data (GB)") +
theme_classic() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")3.3 Recovered microbial fraction
singlem_table <- sequence_fractions %>%
mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
left_join(sample_metadata, by = join_by(sample == sample)) %>%
mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
select(sample,mags_proportion,singlem_proportion) %>%
mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify
singlem_table %>%
pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
ggplot(., aes(x = value, y = sample, color=proportion)) +
geom_line(aes(group = sample), color = "#f8a538") +
geom_point() +
scale_color_manual(name="Proportion",
breaks=c("mags_proportion","singlem_proportion"),
labels=c("Recovered","Estimated"),
values=c("#52e1e8", "#876b53"))+
theme_classic() +
labs(y = "Samples", x = "Prokaryotic fraction (%)") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "right")damr <- singlem_table %>%
mutate(damr=ifelse(is.na(singlem_proportion),100,mags_proportion/singlem_proportion*100)) %>%
select(sample,damr)
damr %>%
tt()| sample | damr |
|---|---|
| EHI01966 | 100.00000 |
| EHI01967 | 100.00000 |
| EHI01968 | 100.00000 |
| EHI01970 | 100.00000 |
| EHI01979 | 100.00000 |
| EHI01982 | 100.00000 |
| EHI01985 | 100.00000 |
| EHI01989 | 100.00000 |
| EHI01992 | 100.00000 |
| EHI01995 | 100.00000 |
| EHI02065 | 100.00000 |
| EHI02067 | 100.00000 |
| EHI02079 | 100.00000 |
| EHI02088 | 100.00000 |
| EHI02097 | 100.00000 |
| EHI02105 | 100.00000 |
| EHI02582 | 100.00000 |
| EHI02584 | 99.78431 |
| EHI02585 | 100.00000 |
| EHI02587 | 100.00000 |
| EHI02592 | 100.00000 |
| EHI02598 | 100.00000 |
| EHI02602 | 100.00000 |
| EHI02603 | 100.00000 |
| EHI02607 | 100.00000 |
| EHI02612 | 100.00000 |
| EHI02615 | 100.00000 |
| EHI02617 | 100.00000 |
| EHI02619 | 100.00000 |
| EHI02624 | 100.00000 |
| EHI02625 | 100.00000 |
| EHI02630 | 100.00000 |
| EHI02632 | 100.00000 |
| EHI02633 | 100.00000 |
| EHI02639 | 100.00000 |
[1] 35
| mean | sd |
|---|---|
| 99.99384 | 0.03645853 |